import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd
import glob
import json
from sklearn import cluster
from scipy.cluster import hierarchy
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
# Location of the CORD-19 dataset and its per-paper metadata table.
root_path = 'Experiment3/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'

# Force ID-like columns to str so identifiers are not mangled into floats.
id_column_types = {
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str,
}
meta_df = pd.read_csv(metadata_path, dtype=id_column_types)
meta_df.head()
| cord_uid | sha | source_x | title | doi | pmcid | pubmed_id | license | abstract | publish_time | authors | journal | Microsoft Academic Paper ID | WHO #Covidence | has_pdf_parse | has_pmc_xml_parse | full_text_file | url | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | xqhn0vbp | 1e1286db212100993d03cc22374b624f7caee956 | PMC | Airborne rhinovirus detection and effect of ul... | 10.1186/1471-2458-3-5 | PMC140314 | 12525263 | no-cc | BACKGROUND: Rhinovirus, the most common cause ... | 2003-01-13 | Myatt, Theodore A; Johnston, Sebastian L; Rudn... | BMC Public Health | NaN | NaN | True | True | custom_license | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... |
| 1 | gi6uaa83 | 8ae137c8da1607b3a8e4c946c07ca8bda67f88ac | PMC | Discovering human history from stomach bacteria | 10.1186/gb-2003-4-5-213 | PMC156578 | 12734001 | no-cc | Recent analyses of human pathogens have reveal... | 2003-04-28 | Disotell, Todd R | Genome Biol | NaN | NaN | True | True | custom_license | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... |
| 2 | le0ogx1s | NaN | PMC | A new recruit for the army of the men of death | 10.1186/gb-2003-4-7-113 | PMC193621 | 12844350 | no-cc | The army of the men of death, in John Bunyan's... | 2003-06-27 | Petsko, Gregory A | Genome Biol | NaN | NaN | False | True | custom_license | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... |
| 3 | fy4w7xz8 | 0104f6ceccf92ae8567a0102f89cbb976969a774 | PMC | Association of HLA class I with severe acute r... | 10.1186/1471-2350-4-9 | PMC212558 | 12969506 | no-cc | BACKGROUND: The human leukocyte antigen (HLA) ... | 2003-09-12 | Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean... | BMC Med Genet | NaN | NaN | True | True | custom_license | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2... |
| 4 | 0qaoam29 | 5b68a553a7cbbea13472721cd1ad617d42b40c26 | PMC | A double epidemic model for the SARS propagation | 10.1186/1471-2334-3-19 | PMC222908 | 12964944 | no-cc | BACKGROUND: An epidemic of a Severe Acute Resp... | 2003-09-10 | Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine | BMC Infect Dis | NaN | NaN | True | True | custom_license | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2... |
# Recursively collect every per-paper JSON file under the dataset root.
json_pattern = f'{root_path}/**/*.json'
all_json = glob.glob(json_pattern, recursive=True)
len(all_json)
59311
class FileReader:
    """Parse a single CORD-19 paper JSON file.

    Exposes the paper id plus the abstract and body text, each flattened
    into one newline-joined string (empty string when the section is absent).
    """

    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        # FIX: some schema variants omit the 'abstract' (or even 'body_text')
        # key entirely; the original indexed content['abstract'] directly and
        # the resulting KeyError made the ingest loop silently drop the paper.
        self.abstract = '\n'.join(
            entry['text'] for entry in content.get('abstract', []))
        self.body_text = '\n'.join(
            entry['text'] for entry in content.get('body_text', []))

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
def get_breaks(content, length):
    """Insert HTML ``<br>`` tags so hover-tooltip text wraps.

    Walks the space-separated words of *content* and starts a new line
    whenever the running character count would exceed *length*.

    Fixes over the original version:
      * no spurious leading space before the first word;
      * no leading ``<br>`` when the very first word alone exceeds *length*;
      * after a break the counter restarts at the length of the word that
        begins the new line (it was reset to 0, which let every line run
        long by one extra word's worth of characters).
    """
    pieces = []
    total_chars = 0
    for word in content.split(' '):
        total_chars += len(word)
        if total_chars > length and pieces:
            pieces.append('<br>' + word)
            total_chars = len(word)  # the new line already holds this word
        else:
            pieces.append((' ' if pieces else '') + word)
    return ''.join(pieces)
# Staging dict: one list per output column of the final DataFrame.
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

# FIX: progress step is guarded with max(1, ...).  The original computed
# len(all_json) // 10 inside the try block, so with fewer than 10 files the
# ZeroDivisionError was swallowed by the broad except and EVERY paper was
# silently skipped.
progress_step = max(1, len(all_json) // 10)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    try:
        content = FileReader(entry)

        # Metadata row matched on the JSON sha / paper_id; skip papers with
        # no metadata.  Looked up once and reused below (the original ran
        # this same query a second time further down).
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
        if len(meta_data) == 0:
            continue

        dict_['paper_id'].append(content.paper_id)
        dict_['abstract'].append(content.abstract)
        dict_['body_text'].append(content.body_text)

        # Abstract summary column, used by the interactive plot hover tool.
        if len(content.abstract) == 0:
            # no abstract provided
            dict_['abstract_summary'].append("Not provided.")
        elif len(content.abstract.split(' ')) > 100:
            # too long for the plot: keep the first 100 words, append "..."
            info = content.abstract.split(' ')[:100]
            summary = get_breaks(' '.join(info), 40)
            dict_['abstract_summary'].append(summary + "...")
        else:
            # abstract is short enough to show in full
            summary = get_breaks(content.abstract, 40)
            dict_['abstract_summary'].append(summary)

        # Authors: truncate to the first two (plus "...") when there are more
        # than two, so the hover box stays compact.
        try:
            authors = meta_data['authors'].values[0].split(';')
            if len(authors) > 2:
                dict_['authors'].append(". ".join(authors[:2]) + "...")
            else:
                dict_['authors'].append(". ".join(authors))
        except Exception:
            # single author, or a NaN (float) value -- store the raw value
            dict_['authors'].append(meta_data['authors'].values[0])

        # Title with line breaks for the hover tool; fall back to the raw
        # value (e.g. NaN) when get_breaks cannot handle it.
        try:
            title = get_breaks(meta_data['title'].values[0], 40)
            dict_['title'].append(title)
        except Exception:
            dict_['title'].append(meta_data['title'].values[0])

        dict_['journal'].append(meta_data['journal'].values[0])
    except Exception:
        # Deliberate best effort: a malformed JSON or unexpected schema just
        # skips that paper instead of aborting the whole ingest.
        continue

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
Processing index: 0 of 59311 Processing index: 5931 of 59311 Processing index: 11862 of 59311 Processing index: 17793 of 59311 Processing index: 23724 of 59311 Processing index: 29655 of 59311 Processing index: 35586 of 59311 Processing index: 41517 of 59311 Processing index: 47448 of 59311 Processing index: 53379 of 59311 Processing index: 59310 of 59311
# Release the (large) staging dict now that the DataFrame owns the data.
dict_ = None
df_covid.describe()
| paper_id | abstract | body_text | authors | title | journal | abstract_summary | |
|---|---|---|---|---|---|---|---|
| count | 36009 | 36009 | 36009 | 35413 | 35973 | 34277 | 36009 |
| unique | 36009 | 26249 | 35981 | 33538 | 35652 | 5410 | 26239 |
| top | 4ed70c27f14b7f9e6219fe605eae2b21a229f23c | In previous reports, workers have characterize... | Domingo, Esteban | In the Literature | PLoS One | Not provided. | |
| freq | 1 | 9704 | 3 | 14 | 9 | 1518 | 9704 |
# Word-count columns used for the sanity statistics printed below.
df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))
df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))

# Drop unusable rows: missing values, empty abstracts/bodies, and exact
# duplicate (abstract, body_text) pairs.
df_covid.dropna(inplace=True)
df_covid = df_covid[df_covid.abstract != '']
df_covid = df_covid[df_covid.body_text != '']
df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)

# Headline statistics for both word-count columns.
metrics = ['count', 'mean', 'std', 'min', 'max']
abstract_word_count_stat = df_covid.describe(include='all')['abstract_word_count']
body_word_count_stat = df_covid.describe(include='all')['body_word_count']
for header_text, stat_series in (('Abstract Word Count', abstract_word_count_stat),
                                 ('\nBody Word Count', body_word_count_stat)):
    print(header_text)
    for metric in metrics:
        print('{} : {:.4f}'.format(metric, stat_series[metric]))
Abstract Word Count count : 24584.0000 mean : 216.4467 std : 137.0651 min : 1.0000 max : 3694.0000 Body Word Count count : 24584.0000 mean : 4435.4751 std : 3657.4214 min : 23.0000 max : 232431.0000
import re

# Strip punctuation/special characters, keeping only alphanumerics and
# whitespace.  FIX: the original class was '[^a-zA-z0-9\s]' -- the 'A-z'
# range spans ASCII Z..a and therefore also KEPT the characters [ \ ] ^ _ `
# Corrected to 'A-Z' (and precompiled once instead of per row).
_non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
df_covid['body_text'] = df_covid['body_text'].apply(lambda x: _non_alnum.sub('', x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: _non_alnum.sub('', x))
def lower_case(input_str):
    """Return *input_str* lower-cased (thin wrapper kept for readability)."""
    return input_str.lower()
# Lower-case both text columns (vectorised equivalent of mapping lower_case
# over every row).
df_covid['body_text'] = df_covid['body_text'].str.lower()
df_covid['abstract'] = df_covid['abstract'].str.lower()
# Keep only the cleaned body text for vectorisation; every other column is
# metadata/plot material.
meta_columns = ["paper_id", "abstract", "abstract_word_count", "body_word_count",
                "authors", "title", "journal", "abstract_summary"]
text = df_covid.drop(meta_columns, axis=1)
text_arr = text.stack().tolist()
len(text_arr)
24584
# Tokenise each body text on single spaces: one token list per document.
words = [str(body).split(" ") for body in text['body_text']]
# Word 2-grams per document: each consecutive word pair is concatenated
# with no separator (same behaviour as the original "".join of a 2-slice).
n_gram_all = [
    ["".join(tokens[i:i + 2]) for i in range(len(tokens) - 1)]
    for tokens in words
]
from sklearn.feature_extraction.text import HashingVectorizer

# Hash each document's 2-gram token list straight into a fixed 4096-dim
# sparse feature matrix X.  The identity analyzer skips re-tokenisation,
# since the documents are already lists of n-gram tokens.
hvec = HashingVectorizer(lowercase=False, analyzer=lambda tokens: tokens, n_features=2 ** 12)
X = hvec.fit_transform(n_gram_all)
# Project the hashed features to 2-D for visualisation.
# Following cell may take 20-30 minutes to run
from sklearn.manifold import TSNE
# Low perplexity (default is 30) biases t-SNE toward very local structure.
tsne = TSNE(verbose=1, perplexity=5)
X_embedded = tsne.fit_transform(X.toarray())
[t-SNE] Computing 16 nearest neighbors... [t-SNE] Indexed 24584 samples in 0.313s... [t-SNE] Computed neighbors for 24584 samples in 52.697s... [t-SNE] Computed conditional probabilities for sample 1000 / 24584 [t-SNE] Computed conditional probabilities for sample 2000 / 24584 [t-SNE] Computed conditional probabilities for sample 3000 / 24584 [t-SNE] Computed conditional probabilities for sample 4000 / 24584 [t-SNE] Computed conditional probabilities for sample 5000 / 24584 [t-SNE] Computed conditional probabilities for sample 6000 / 24584 [t-SNE] Computed conditional probabilities for sample 7000 / 24584 [t-SNE] Computed conditional probabilities for sample 8000 / 24584 [t-SNE] Computed conditional probabilities for sample 9000 / 24584 [t-SNE] Computed conditional probabilities for sample 10000 / 24584 [t-SNE] Computed conditional probabilities for sample 11000 / 24584 [t-SNE] Computed conditional probabilities for sample 12000 / 24584 [t-SNE] Computed conditional probabilities for sample 13000 / 24584 [t-SNE] Computed conditional probabilities for sample 14000 / 24584 [t-SNE] Computed conditional probabilities for sample 15000 / 24584 [t-SNE] Computed conditional probabilities for sample 16000 / 24584 [t-SNE] Computed conditional probabilities for sample 17000 / 24584 [t-SNE] Computed conditional probabilities for sample 18000 / 24584 [t-SNE] Computed conditional probabilities for sample 19000 / 24584 [t-SNE] Computed conditional probabilities for sample 20000 / 24584 [t-SNE] Computed conditional probabilities for sample 21000 / 24584 [t-SNE] Computed conditional probabilities for sample 22000 / 24584 [t-SNE] Computed conditional probabilities for sample 23000 / 24584 [t-SNE] Computed conditional probabilities for sample 24000 / 24584 [t-SNE] Computed conditional probabilities for sample 24584 / 24584 [t-SNE] Mean sigma: 0.126564 [t-SNE] KL divergence after 250 iterations with early exaggeration: 147.965591 [t-SNE] KL divergence after 1000 iterations: 4.554127
from matplotlib import pyplot as plt
import seaborn as sns

# Plot the raw (unclustered) t-SNE embedding.
sns.set(rc={'figure.figsize': (15, 15)})

# FIX: seaborn.scatterplot no longer accepts the data vectors positionally
# (deprecated in 0.12, later removed) -- pass them as x=/y= keywords.
# The single-colour 'palette' argument was dropped: with no 'hue' mapping
# it was ignored (with a warning) anyway.
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1])
plt.title("t-SNE Covid-19 Articles")
# plt.savefig("plots/t-sne_covid19.png")
plt.show()
from sklearn.cluster import MiniBatchKMeans

# Cluster the hashed features into k groups and colour the embedding by label.
k = 10
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)

sns.set(rc={'figure.figsize': (15, 15)})
palette = sns.color_palette("bright", len(set(y_pred)))

# FIX: pass the data vectors as x=/y= keywords (positional data arguments
# were deprecated and later removed from seaborn.scatterplot).
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y_pred, legend='full', palette=palette)
plt.title("t-SNE Covid-19 Articles - Clustered")
# plt.savefig("plots/t-sne_covid19_label.png")
plt.show()
# Repeat the clustering with k = 20 to compare label granularity.
k = 20
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)

sns.set(rc={'figure.figsize': (15, 15)})
palette = sns.color_palette("bright", len(set(y_pred)))

# FIX: pass the data vectors as x=/y= keywords (positional data arguments
# were deprecated and later removed from seaborn.scatterplot).
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y_pred, legend='full', palette=palette)
plt.title("t-SNE Covid-19 Articles - Clustered")
# plt.savefig("plots/t-sne_covid19_label.png")
plt.show()
# Rebuild each document as a single space-separated string of its 2-gram
# tokens, since TfidfVectorizer expects raw text input.
n_gram_list = [" ".join(grams) for grams in n_gram_all]

from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf features over the 2-gram "vocabulary", capped at 4096 dimensions.
vectorizer = TfidfVectorizer(max_features=2 ** 12)
X = vectorizer.fit_transform(n_gram_list)
from sklearn.cluster import MiniBatchKMeans
# Cluster the tf-idf features; the labels are kept both as y_pred and y
# (y is what the final static plot below colours by).
k = 10
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)
y = y_pred
# Project the tf-idf features to 2-D, same settings as the first embedding.
# Following cell will take 20-30 minutes to run
from sklearn.manifold import TSNE
# Low perplexity (default is 30) biases t-SNE toward very local structure.
tsne = TSNE(verbose=1, perplexity=5)
X_embedded = tsne.fit_transform(X.toarray())
[t-SNE] Computing 16 nearest neighbors... [t-SNE] Indexed 24584 samples in 0.070s... [t-SNE] Computed neighbors for 24584 samples in 49.610s... [t-SNE] Computed conditional probabilities for sample 1000 / 24584 [t-SNE] Computed conditional probabilities for sample 2000 / 24584 [t-SNE] Computed conditional probabilities for sample 3000 / 24584 [t-SNE] Computed conditional probabilities for sample 4000 / 24584 [t-SNE] Computed conditional probabilities for sample 5000 / 24584 [t-SNE] Computed conditional probabilities for sample 6000 / 24584 [t-SNE] Computed conditional probabilities for sample 7000 / 24584 [t-SNE] Computed conditional probabilities for sample 8000 / 24584 [t-SNE] Computed conditional probabilities for sample 9000 / 24584 [t-SNE] Computed conditional probabilities for sample 10000 / 24584 [t-SNE] Computed conditional probabilities for sample 11000 / 24584 [t-SNE] Computed conditional probabilities for sample 12000 / 24584 [t-SNE] Computed conditional probabilities for sample 13000 / 24584 [t-SNE] Computed conditional probabilities for sample 14000 / 24584 [t-SNE] Computed conditional probabilities for sample 15000 / 24584 [t-SNE] Computed conditional probabilities for sample 16000 / 24584 [t-SNE] Computed conditional probabilities for sample 17000 / 24584 [t-SNE] Computed conditional probabilities for sample 18000 / 24584 [t-SNE] Computed conditional probabilities for sample 19000 / 24584 [t-SNE] Computed conditional probabilities for sample 20000 / 24584 [t-SNE] Computed conditional probabilities for sample 21000 / 24584 [t-SNE] Computed conditional probabilities for sample 22000 / 24584 [t-SNE] Computed conditional probabilities for sample 23000 / 24584 [t-SNE] Computed conditional probabilities for sample 24000 / 24584 [t-SNE] Computed conditional probabilities for sample 24584 / 24584 [t-SNE] Mean sigma: 0.171953 [t-SNE] KL divergence after 250 iterations with early exaggeration: 156.275116 [t-SNE] KL divergence after 1000 iterations: 5.079303
from matplotlib import pyplot as plt
import seaborn as sns

# Final static plot: tf-idf + 2-gram features, coloured by k-means cluster.
sns.set(rc={'figure.figsize': (15, 15)})
palette = sns.color_palette("bright", len(set(y)))

# FIX: pass the data vectors as x=/y= keywords (positional data arguments
# were deprecated and later removed from seaborn.scatterplot).
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y, legend='full', palette=palette)
plt.title("t-SNE Covid-19 Articles - Clustered(K-Means) - Tf-idf with 2-Gram")
plt.show()
# Bokeh pieces for the interactive, searchable cluster plot (the original
# duplicate single-name imports are consolidated; every name is kept).
from bokeh.io import output_file, output_notebook, show
from bokeh.layouts import column, gridplot, widgetbox
from bokeh.models import (ColumnDataSource, CustomJS, Div, HoverTool,
                          LinearColorMapper, Paragraph, RadioButtonGroup,
                          TextInput)
from bokeh.palettes import Category20
from bokeh.plotting import figure
from bokeh.transform import linear_cmap, transform

output_notebook()
y_labels = y_pred

# One row per paper: plot coordinates (plus backup copies the JS callbacks
# restore from when filtering), cluster id, and the hover-tool text fields.
source = ColumnDataSource(data={
    'x': X_embedded[:, 0],
    'y': X_embedded[:, 1],
    'x_backup': X_embedded[:, 0],
    'y_backup': X_embedded[:, 1],
    'desc': y_labels,
    'titles': df_covid['title'],
    'authors': df_covid['authors'],
    'journal': df_covid['journal'],
    'abstract': df_covid['abstract_summary'],
    'labels': ["C-" + str(label) for label in y_labels],
})
# Hover tooltip; {safe} renders the stored <br> tags as real HTML breaks.
hover = HoverTool(tooltips=[
    ("Title", "@titles{safe}"),
    ("Author(s)", "@authors"),
    ("Journal", "@journal"),
    ("Abstract", "@abstract{safe}"),
],
point_policy="follow_mouse")

# Map cluster ids to colours.
mapper = linear_cmap(field_name='desc',
                     palette=Category20[20],
                     low=min(y_labels), high=max(y_labels))

# FIX: plot_width/plot_height were removed in Bokeh 3; width/height are the
# equivalent (and already valid in Bokeh 2.x).
p = figure(width=800, height=800,
           tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'],
           title="t-SNE Covid-19 Articles, Clustered(K-Means), Tf-idf with Plain Text",
           toolbar_location="right")

# FIX: the ambiguous string 'legend' argument is deprecated; legend_field
# (Bokeh >= 1.4) states explicitly that points are grouped by the 'labels'
# column of the data source.
p.scatter('x', 'y', size=5,
          source=source,
          fill_color=mapper,
          line_alpha=0.3,
          line_color="black",
          legend_field='labels')
# Radio-button callback: show only the selected cluster; index 20 ("All")
# restores every point from the backup coordinates.
# FIX: the first JS loop used `i` without `var` (an implicit global, which is
# a ReferenceError in strict mode); it now declares `var i` like the second
# loop already did.
callback = CustomJS(args=dict(p=p, source=source), code="""
var radio_value = cb_obj.active;
var data = source.data;
var x = data['x'];
var y = data['y'];
var x_backup = data['x_backup'];
var y_backup = data['y_backup'];
var labels = data['desc'];
if (radio_value == '20') {
    for (var i = 0; i < x.length; i++) {
        x[i] = x_backup[i];
        y[i] = y_backup[i];
    }
}
else {
    for (var i = 0; i < x.length; i++) {
        if(labels[i] == radio_value) {
            x[i] = x_backup[i];
            y[i] = y_backup[i];
        } else {
            x[i] = undefined;
            y[i] = undefined;
        }
    }
}
source.change.emit();
""")
# callback for searchbar
keyword_callback = CustomJS(args=dict(p=p, source=source), code="""
var text_value = cb_obj.value;
var data = source.data;
var x = data['x'];
var y = data['y'];
var x_backup = data['x_backup'];
var y_backup = data['y_backup'];
var abstract = data['abstract'];
var titles = data['titles'];
var authors = data['authors'];
var journal = data['journal'];
for (var i = 0; i < x.length; i++) {
if(abstract[i].includes(text_value) ||
titles[i].includes(text_value) ||
authors[i].includes(text_value) ||
journal[i].includes(text_value)) {
x[i] = x_backup[i];
y[i] = y_backup[i];
} else {
x[i] = undefined;
y[i] = undefined;
}
}
source.change.emit();
""")
# Cluster filter buttons: C-0..C-19 plus "All" (initially active).
# NOTE(review): 21 options assume k == 20 clusters, but the final
# MiniBatchKMeans above used k == 10, so buttons C-10..C-19 will just show an
# empty plot -- confirm whether the radio group should match the final k.
option = RadioButtonGroup(labels=["C-" + str(i) for i in range(20)] + ["All"],
                          active=20)
option.js_on_click(callback)

# Free-text search box wired to the keyword callback.
keyword = TextInput(title="Search:")
keyword.js_on_change('value', keyword_callback)

# Header above the plot.
header = Div(text="""<h1>COVID-19 Literature Cluster</h1>""")

# FIX: widgetbox was deprecated (and removed in Bokeh 3); column is the
# drop-in replacement and is already imported above.
show(column(header, column(option, keyword), p))